In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.sparse import csr_matrix, coo_matrix
import numpy as np
%matplotlib inline
In [67]:
# Load the Book-Crossing ratings and book metadata.
ratings = pd.read_csv('BX-Book-Ratings.csv', encoding='iso-8859-1', sep=';')
ratings.columns = ['user_id', 'isbn', 'book_rating']

books = pd.read_csv('BX-Books.csv', sep=';', encoding='iso-8859-1', dtype=str)

# Collapse duplicate editions: map every title to a single canonical ISBN
# (the last ISBN seen for that title wins when building the dict).
book_dict = books[["Book-Title", "ISBN"]].set_index("Book-Title").to_dict()["ISBN"]
books['isbn'] = books["Book-Title"].map(book_dict)
# Sanity check (displayed interactively in the original notebook):
# books["Book-Title"].nunique() == books["isbn"].nunique()

# Drop columns not needed downstream; the canonical 'isbn' replaces 'ISBN'.
books = books.drop(columns=['Image-URL-L', 'Image-URL-M', 'Image-URL-S',
                            'Book-Author', 'Publisher', 'ISBN'])

# Keep only explicit (non-zero) ratings.
newdf = ratings[ratings.book_rating > 0]
joined = books.merge(newdf, on='isbn')
print(newdf.shape)
In [92]:
# Scraped Goodreads metadata, delivered in two CSV batches.
bookinfo = pd.read_csv("goodreads_list_props.csv")
bookinfo2 = pd.read_csv("goodreads_list_props1.csv")
In [93]:
import pickle  # NOTE(review): unused — pd.read_pickle handles unpickling itself
# Third metadata batch. WARNING: only unpickle files from a trusted source;
# pickle.load can execute arbitrary code.
bookinfo3 = pd.read_pickle("ibsn_features_full.pickle")
In [94]:
# Inspect the raw column names of the first Goodreads batch.
bookinfo.columns
Out[94]:
In [95]:
# Inspect the raw column names of the second Goodreads batch.
bookinfo2.columns
Out[95]:
In [96]:
# Inspect the raw column names of the pickled batch before renaming.
bookinfo3.columns
Out[96]:
In [97]:
# Normalise the pickled batch to the shared schema used for all batches below.
bookinfo3.columns = ['isbn13','description','no_of_pages','book_name']
In [98]:
# Stack all three metadata batches into one frame with a common schema,
# then remove exact duplicate rows.
bookinfo2.columns = bookinfo.columns
combined = pd.concat([bookinfo, bookinfo2])
combined = combined[['isbn13', 'description', 'no_of_pages', 'book_name']]
combined = pd.concat([combined, bookinfo3])
bookinfo = combined.drop_duplicates()
In [99]:
# Keep one row per canonical ISBN (titles were collapsed onto one ISBN earlier).
books.drop_duplicates(subset = 'isbn',inplace = True)
In [18]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
def is_isbn10_valid(isbn):
    """Return True if `isbn` is a valid ISBN-10 string.

    The first nine characters must be digits; the tenth may be a digit
    or 'X' (which stands for 10).  Checksum rule from:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number
    """
    if len(isbn) != 10:
        return False
    body, check = isbn[:9], isbn[9]
    if not body.isdigit():
        return False
    if check != 'X' and not check.isdigit():
        return False
    total = 0
    for position, char in enumerate(isbn):
        value = 10 if char == 'X' else int(char)
        total += (10 - position) * value
    return total % 11 == 0
def is_isbn13_valid(isbn):
    """Return True if `isbn` is a valid all-digit ISBN-13 string.

    Checksum rule from:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number
    """
    if len(isbn) != 13 or not isbn.isdigit():
        return False
    # Weights alternate 1, 3, 1, 3, ... over the first twelve digits.
    weighted = [int(d) * (3 if i % 2 else 1) for i, d in enumerate(isbn[:12])]
    expected = (10 - sum(weighted) % 10) % 10
    return expected == int(isbn[12])
def isbn13_to_isbn10(isbn13_str):
    """Convert an ISBN-13 string to its ISBN-10 equivalent.

    Drops the three-digit EAN prefix and recomputes the mod-11 check
    digit: a raw value of 10 becomes 'X', a raw value of 11 becomes '0'.
    Assumes the input is already a valid ISBN-13.
    """
    core = isbn13_str[3:12]
    weighted_total = sum((10 - idx) * int(digit)
                         for idx, digit in enumerate(core))
    remainder = 11 - weighted_total % 11
    if remainder == 10:
        suffix = 'X'
    elif remainder == 11:
        suffix = '0'
    else:
        suffix = str(remainder)
    return core + suffix
def isbn10_to_isbn13(isbn10_str):
    """Convert an ISBN-10 string to its '978'-prefixed ISBN-13 equivalent.

    Recomputes the mod-10 check digit over the new 12-digit payload.
    Assumes the input is already a valid ISBN-10.
    """
    payload = '978' + isbn10_str[:9]
    total = 0
    for idx, digit in enumerate(payload):
        # Weights alternate 1, 3, 1, 3, ...
        total += int(digit) * (3 if idx % 2 else 1)
    return payload + str((10 - total % 10) % 10)
def isbn_converter(isbn):
    """Convert between the two ISBN formats.

    ISBN-10 input yields ISBN-13 and vice versa; returns None when the
    input is valid in neither format.
    """
    if is_isbn10_valid(isbn):
        return isbn10_to_isbn13(isbn)
    if is_isbn13_valid(isbn):
        return isbn13_to_isbn10(isbn)
    return None
if __name__ == "__main__":
    # CLI mode: convert each ISBN passed on the command line,
    # reporting invalid ones explicitly.
    for raw_isbn in sys.argv[1:]:
        converted = isbn_converter(raw_isbn)
        if converted:
            print(converted)
        else:
            print("Bad ISBN " + raw_isbn)
In [19]:
# Derive the alternate-format ISBN for every book
# (None where the stored ISBN is invalid in both formats).
isbn13 = [isbn_converter(code) for code in books['isbn']]
In [25]:
# Attach the converted ISBN-13s; rows with unconvertible ISBNs hold None.
books['isbn13'] = isbn13
In [26]:
# Drop rows without an isbn13 so the merge key below is present on both sides.
books.dropna(subset = ['isbn13'],inplace = True)
bookinfo.dropna(subset = ['isbn13'],inplace = True)
In [27]:
# Inner join: keep only books that appear in both the ratings metadata
# and the Goodreads metadata.
mergedinfo = bookinfo.merge(books,on = 'isbn13',how = 'inner')
In [28]:
import re
def striphtml(data):
    """Remove HTML tags from `data`.

    Returns the tag-free string, or None when `data` is not a string
    (e.g. NaN/None coming from a pandas column with missing descriptions).
    """
    pattern = re.compile('<.*?>')
    try:
        return pattern.sub('', data)
    except TypeError:
        # Non-string input (None, float NaN, ...). The original bare
        # `except:` swallowed every error; only this case is expected.
        return None
In [29]:
# Clean descriptions: strip HTML tags, trim surrounding whitespace, and
# remove quote/comma characters that would interfere with tokenisation.
mergedinfo['description'] = mergedinfo['description'].apply(striphtml)
mergedinfo['description'] = mergedinfo['description'].str.strip()
mergedinfo['description'] = (mergedinfo['description']
                             .str.replace('“', '')
                             .str.replace(',', '')
                             .str.replace('"', ''))
In [30]:
from nltk.corpus import stopwords
# ...
# Tokenise each description and drop English stopwords.
# Missing descriptions (None/NaN) map to None so row alignment is preserved.
filtereddesc = []
stops = set(stopwords.words("english"))
for desc in mergedinfo['description']:
    if isinstance(desc, str):
        words = desc.split()
        filtereddesc.append([word for word in words if word not in stops])
    else:
        # Was a bare `except:` around desc.split(); only non-strings fail there.
        filtereddesc.append(None)
In [31]:
# Store the stopword-filtered token lists alongside the raw descriptions.
mergedinfo['filtered_description'] = filtereddesc
In [32]:
# Materialise each token list ([] for missing descriptions).
# NOTE(review): `wordlist` is never used again in this file — dead code?
wordlist = []
for descs in mergedinfo['filtered_description']:
    sentence = [] if descs is None else list(descs)
    wordlist.append(sentence)
Download Google's pretrained word2vec model (GoogleNews-vectors-negative300.bin) before running the next cell.
In [33]:
import gensim
# Load Google's pretrained 300-d word2vec vectors. The binary file
# (GoogleNews-vectors-negative300.bin) must be downloaded separately
# and is very large — this cell is slow and memory-hungry.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
In [34]:
# Candidate genres; downstream score vectors have one entry per genre (14 here).
genres = ['Science','Satire','Drama','Action','Romance','Mystery','Horror','Travel','Children','Religion','History','Biography','Autobiography','Fantasy']
In [35]:
# Score every description against every genre: a genre's score is the mean
# word2vec similarity between the genre name and the description's tokens.
scores = []
for desc in mergedinfo['filtered_description']:
    if desc is not None:
        gscore = []
        for genre in genres:
            simsum = 0
            n = 0
            for word in desc:
                try:
                    simsum = simsum + model.similarity(word, genre)
                    n = n + 1
                except KeyError:
                    # Word not in the word2vec vocabulary — skip it.
                    # (Was a bare `except:` that hid all other errors.)
                    continue
            if n != 0:
                gscore.append(simsum / n)
            else:
                gscore.append(0)  # no in-vocabulary words for this description
        scores.append(gscore)
    else:
        scores.append(None)  # missing description
In [36]:
# Replace missing score vectors with all-zero vectors so every book row
# survives into the genre-score frame.
editedscores = []
for score in scores:
    if score is not None:
        editedscores.append(score)
    else:
        editedscores.append([0] * len(genres))  # was a magic constant 14
In [37]:
# One column per genre, e.g. 'Science_Score', aligned row-for-row with mergedinfo.
scoredf = pd.DataFrame(editedscores,columns = [genre + '_Score' for genre in genres])
In [38]:
# Column-wise concat; mergedinfo has a fresh RangeIndex (it came from .merge),
# so its rows align positionally with scoredf.
bookfeatures = pd.concat([mergedinfo,scoredf],axis = 1)
Amazon dataset — repeat the cleaning and genre-scoring pipeline for the books in the Amazon ratings file.
In [53]:
# Restrict the Goodreads feature table to books present in the Amazon file.
newbooks = pd.read_csv("Combine.csv")
newbooksisbn = newbooks['isbn']
newbooksisbn13 = [isbn_converter(code) for code in newbooksisbn]
newbooksuniqueisbn13 = list(set(newbooksisbn13))
# .copy() so the column assignments in later cells modify an independent
# frame instead of a slice of bookinfo (avoids SettingWithCopyWarning).
amazonbookfeatures = bookinfo[bookinfo['isbn13'].isin(newbooksuniqueisbn13)].copy()
In [44]:
# Same description cleaning as for mergedinfo: strip HTML, trim, drop quotes/commas.
amazonbookfeatures['description'] = amazonbookfeatures['description'].apply(striphtml)
amazonbookfeatures['description'] = amazonbookfeatures['description'].str.strip()
amazonbookfeatures['description'] = (amazonbookfeatures['description']
                                     .str.replace('“', '')
                                     .str.replace(',', '')
                                     .str.replace('"', ''))
In [45]:
# Tokenise and drop English stopwords; None preserves row alignment for
# missing descriptions.
# NOTE(review): duplicates the earlier cleaning loop — worth extracting
# into a shared helper function.
filtereddesc = []
stops = set(stopwords.words("english"))
for desc in amazonbookfeatures['description']:
    if isinstance(desc, str):
        words = desc.split()
        filtereddesc.append([word for word in words if word not in stops])
    else:
        # Was a bare `except:` around desc.split(); only non-strings fail there.
        filtereddesc.append(None)
In [46]:
# Store the filtered token lists, then materialise each one
# ([] for missing descriptions).
# NOTE(review): `wordlist` is never used again in this file — dead code?
amazonbookfeatures['filtered_description'] = filtereddesc
wordlist = []
for descs in amazonbookfeatures['filtered_description']:
    sentence = [] if descs is None else list(descs)
    wordlist.append(sentence)
In [47]:
# Genre scoring for the Amazon subset: mean word2vec similarity between
# each genre name and the description's tokens.
scores = []
for desc in amazonbookfeatures['filtered_description']:
    if desc is not None:
        gscore = []
        for genre in genres:
            simsum = 0
            n = 0
            for word in desc:
                try:
                    simsum = simsum + model.similarity(word, genre)
                    n = n + 1
                except KeyError:
                    # Word not in the word2vec vocabulary — skip it.
                    # (Was a bare `except:` that hid all other errors.)
                    continue
            if n != 0:
                gscore.append(simsum / n)
            else:
                gscore.append(0)  # no in-vocabulary words for this description
        scores.append(gscore)
    else:
        scores.append(None)  # missing description
In [48]:
# Replace missing score vectors with all-zero vectors so every Amazon book
# row survives into the genre-score frame.
editedscores = []
for score in scores:
    if score is not None:
        editedscores.append(score)
    else:
        editedscores.append([0] * len(genres))  # was a magic constant 14
In [49]:
# One column per genre, e.g. 'Science_Score', row-aligned with amazonbookfeatures.
scoredf = pd.DataFrame(editedscores,columns = [genre + '_Score' for genre in genres])
In [61]:
# reset_index(drop=True) is required: amazonbookfeatures still carries bookinfo's
# row labels after filtering, while scoredf has a fresh RangeIndex; dropping the
# index makes the column-wise concat align rows positionally.
amzbookfeatures = pd.concat([amazonbookfeatures.reset_index(drop=True),scoredf],axis = 1)